rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  625222 33.4    1434830 76.7   702048 37.5
## Vcells 1171653  9.0    8388608 64.0  1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)


`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )

Note: be careful with 35.2 bin matches

3 Abbreviations

Plant Name Label JCVI-MCScan Compara Plants Plaza OrthoDB FastOma RBH Mercator
Malus domestica apple mdo_GDv1 malus_domestica_golden mdo mdo mdo mdo mdo
mdo_HChap1
Prunus persica ppe ppe prunus_persica ppe ppe pper ppe pper
Prunus dulcis / P. amygdalus almond almond prunus_dulcis pdul pdul pdul pdul
Prunus avium wild cherry wildcherry prunus_avium pavi pavi pavi pavi
Prunus armeniaca apricot apricot parm parm parm parm
Prunus cerasifera cherry plum cherryplum pcer pcer pcer
Pyrus pear pear pcox pcox pcox
Prunus sibirica Siberian apricot siberianapricot psib psib psib
group = "fruitTrees"
params_list <- list(
  plantName = 'mdo'
  , plantNameOut = "apple"
  , plantDirOut = file.path('..', 'reports', group, "apple")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001614683e848>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%

cat(child_content)

4 Subsection: mdo

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   20997   77128   32036   56069   30068   37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID                   to_plant source 
##    <chr>       <chr>      <chr>                       <chr>    <chr>  
##  1 AT4G10330   ath        Maldo.hc.v1a1.ch10A.g00003  mdo      FastOMA
##  2 AT4G10340   ath        Maldo.hc.v1a1.ch10A.g00004  mdo      FastOMA
##  3 AT5G44410   ath        g1                          mdo      FastOMA
##  4 AT5G44440   ath        g1                          mdo      FastOMA
##  5 AT1G03770   ath        Maldo.hc.v1a1.ch10A.g02825  mdo      MCScanX
##  6 AT1G03800   ath        Maldo.hc.v1a1.ch10A.g02815  mdo      MCScanX
##  7 ATMG00910   ath        Maldo.hc.v1a1.sc45A.g49878  mdo      MCScanX
##  8 ATMG01020   ath        Maldo.hc.v1a1.sc45A.g49883  mdo      MCScanX
##  9 AT2G46320   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 10 AT4G27940   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 11 AT2G07695   ath        Maldo.hc.v1a1.sc164A.g48922 mdo      OrthoDB
## 12 AT2G07695   ath        Maldo.hc.v1a1.sc119A.g48697 mdo      OrthoDB
## 13 AT2G28190   ath        Maldo.hc.v1a1.ch6A.g38979   mdo      PLAZA  
## 14 AT2G07732   ath        Maldo.hc.v1a1.sc90A.g49976  mdo      PLAZA  
## 15 AT5G17400   ath        Maldo.hc.v1a1.ch9A.g48118   mdo      PLAZA  
## 16 AT2G47300   ath        Maldo.hc.v1a1.ch17A.g24110  mdo      PLAZA  
## 17 AT1G01020   ath        Maldo.hc.v1a1.ch7A.g41990   mdo      RBH    
## 18 AT1G01030   ath        Maldo.hc.v1a1.ch1A.g25187   mdo      RBH    
## 19 ATMG01410   ath        Maldo.hc.v1a1.sc36A.g49760  mdo      RBH    
## 20 ATMG01410   ath        Maldo.hc.v1a1.sc71A.g49908  mdo      RBH    
## 21 AT4G39400   ath        Maldo.hc.v1a1.ch2A.g27501   mdo      compara
## 22 AT4G39400   ath        Maldo.hc.v1a1.ch15A.g17036  mdo      compara
## 23 AT1G65880   ath        Maldo.hc.v1a1.ch17A.g24066  mdo      compara
## 24 AT5G52820   ath        Maldo.hc.v1a1.ch17A.g24067  mdo      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2241383 119.8    4354065 232.6  3000861 160.3
## Vcells 19974681 152.4   32427968 247.5 32406934 247.3

4.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23046
length(unique(dt$to_geneID))
## [1] 34410
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   20997   77128   32036   56069   30068   37682
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
##   Please report the issue at
##   <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

4.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 117481     27
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##  [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   8145738 435.1   14894365  795.5  14612893  780.5
## Vcells 127017683 969.1  200926043 1533.0 200893451 1532.7

4.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

4.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23046
length(unique(df$to_geneID))
## [1] 34410
range(df$from_count)
## [1]   1 128
range(df$to_count)
## [1]   1 116
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 319
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 288
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          95265          13328           8915
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       47630    10916           7960
##   2       13082     1663            576
##   3        8862      473            178
##   4        8957      150            104
##   5       10128       96             69
##   6        6606       30             28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5384            1939            4274
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5278                4274                1240                1070 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               32036                 579                5384                 667 
##               PLAZA         RBH_MapMan4              reject 
##                6568                1939               58473
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

4.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          53202           3640           2193
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       14805     1615           1530
##   2        7302     1354            341
##   3        6601      402            135
##   4        8013      144             95
##   5        9875       95             64
##   6        6606       30             28
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 21143
length(unique(kept$to_geneID))
## [1] 33288
range(kept$from_count)
## [1]  1 93
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 30
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 27
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

4.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 163                  94                  71                  35 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                1336                  26                 122                  30 
##               PLAZA         RBH_MapMan4              reject 
##                 264                  36                1768
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'ppe'
  , plantNameOut = "peach"
  , plantDirOut = file.path('..', 'reports', group, "peach")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001618c41fae0>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%

cat(child_content)

5 Subsection: ppe

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   16193   44006   17894   38370   20791   24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT1G12040   ath        Prupe.1G000500 ppe      FastOMA
##  2 AT1G62440   ath        Prupe.1G000500 ppe      FastOMA
##  3 AT1G61010   ath        Prupe.I006100  ppe      FastOMA
##  4 AT1G61010   ath        Prupe.I006200  ppe      FastOMA
##  5 AT1G04230   ath        Prupe.1G027200 ppe      MCScanX
##  6 AT1G04240   ath        Prupe.1G027500 ppe      MCScanX
##  7 ATCG00530   ath        Prupe.7G029500 ppe      MCScanX
##  8 ATCG00680   ath        Prupe.7G029800 ppe      MCScanX
##  9 AT3G17900   ath        Prupe.1G267800 ppe      OrthoDB
## 10 AT4G35230   ath        Prupe.1G355500 ppe      OrthoDB
## 11 AT2G07675   ath        Prupe.6G146800 ppe      OrthoDB
## 12 ATMG00980   ath        Prupe.6G146800 ppe      OrthoDB
## 13 AT3G02890   ath        Prupe.2G329000 ppe      PLAZA  
## 14 AT5G16680   ath        Prupe.2G329000 ppe      PLAZA  
## 15 AT4G03170   ath        Prupe.4G260600 ppe      PLAZA  
## 16 AT4G03160   ath        Prupe.4G260600 ppe      PLAZA  
## 17 AT1G01030   ath        Prupe.5G134900 ppe      RBH    
## 18 AT1G01040   ath        Prupe.2G200900 ppe      RBH    
## 19 ATMG01250   ath        Prupe.6G123900 ppe      RBH    
## 20 ATMG01250   ath        Prupe.7G164000 ppe      RBH    
## 21 AT5G01010   ath        Prupe.6G300700 ppe      compara
## 22 AT5G01020   ath        Prupe.6G300300 ppe      compara
## 23 AT1G80940   ath        Prupe.3G103600 ppe      compara
## 24 AT1G80950   ath        Prupe.1G382400 ppe      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6160929  329.1   11915492  636.4  14894365  795.5
## Vcells 135778538 1036.0  232148417 1771.2 200925340 1533.0

5.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23113
length(unique(dt$to_geneID))
## [1] 21245
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   16193   44006   17894   38370   20791   24564
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

5.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 71222   231
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
##   [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
##   [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
##  [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
##  [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
##  [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
##  [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  6312945 337.2   11915492  636.4  14894365  795.5
## Vcells 91750182 700.0  232148417 1771.2 232142054 1771.2

5.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

5.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23113
length(unique(df$to_geneID))
## [1] 21245
range(df$from_count)
## [1]  1 57
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 264
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 135
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          60359           6360           4734
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       29055     5410           4119
##   2        8732      560            368
##   3        5361      185            103
##   4        5521      103             68
##   5        6890       68             51
##   6        4800       34             25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            4153            1191            2083
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5084                2083                1109                 232 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               17894                 415                4153                 516 
##               PLAZA         RBH_MapMan4              reject 
##                4926                1191               33850
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

5.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34638           1556           1409
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9947      853            970
##   2        4843      366            219
##   3        3685      143             81
##   4        4775       92             63
##   5        6588       68             51
##   6        4800       34             25
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20622
length(unique(kept$to_geneID))
## [1] 20886
range(kept$from_count)
## [1]  1 50
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 7
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

5.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 145                  38                  43                   7 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 728                  27                  53                  16 
##               PLAZA         RBH_MapMan4              reject 
##                 166                  21                1150
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pdul'
  , plantNameOut = "almond"
  , plantDirOut = file.path('..', 'reports', group, "almond")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000161b861e7a0>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%

cat(child_content)

6 Subsection: pdul

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

6.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##   15975   42927   20148   35734   24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G05620   ath        TexasF1_G1000  pdul     FastOMA
##  2 AT3G48880   ath        TexasF1_G10001 pdul     FastOMA
##  3 AT3G48890   ath        TexasF1_G9999  pdul     FastOMA
##  4 AT5G52240   ath        TexasF1_G9999  pdul     FastOMA
##  5 AT1G04220   ath        TexasF1_G767   pdul     MCScanX
##  6 AT1G04230   ath        TexasF1_G779   pdul     MCScanX
##  7 ATCG00170   ath        TexasF1_G22620 pdul     MCScanX
##  8 ATCG00500   ath        TexasF1_G22624 pdul     MCScanX
##  9 AT1G23390   ath        TexasF1_G3359  pdul     OrthoDB
## 10 AT5G19210   ath        TexasF1_G2060  pdul     OrthoDB
## 11 AT3G51810   ath        TexasF1_G23162 pdul     OrthoDB
## 12 AT2G28815   ath        TexasF1_G6420  pdul     OrthoDB
## 13 AT1G01030   ath        TexasF1_G18833 pdul     RBH    
## 14 AT1G01040   ath        TexasF1_G9057  pdul     RBH    
## 15 ATMG00860   ath        TexasF1_G25095 pdul     RBH    
## 16 ATMG01250   ath        TexasF1_G22012 pdul     RBH    
## 17 AT4G37540   ath        TexasF1_G22582 pdul     compara
## 18 AT5G67420   ath        TexasF1_G22582 pdul     compara
## 19 AT4G15960   ath        TexasF1_G27715 pdul     compara
## 20 AT3G45140   ath        TexasF1_G6796  pdul     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  5092078 272.0   11915492  636.4  14894365  795.5
## Vcells 96743945 738.1  232148417 1771.2 232142054 1771.2

6.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22451
length(unique(dt$to_geneID))
## [1] 20903
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##   15975   42927   20148   35734   24829
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

6.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

6.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

6.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

6.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 67030
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4648164 248.3   11915492  636.4  14894365  795.5
## Vcells 58956976 449.9  185724412 1417.0 232155515 1771.3

6.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

6.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22451
length(unique(df$to_geneID))
## [1] 20903
range(df$from_count)
## [1]   1 150
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 171
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 120
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          54341           7625           5064
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       26684     6154           4134
##   2        8165      848            468
##   3        5584      289            176
##   4        6775      187            146
##   5        7133      147            140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            4025            1664            2469
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                4234                2469                1038                 583 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               20148                 722                4025                 692 
##         RBH_MapMan4              reject 
##                1664               31455
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

6.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          31939           1865           1771
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9222      690           1054
##   2        4772      600            297
##   3        4402      249            144
##   4        6410      179            136
##   5        7133      147            140
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20096
length(unique(kept$to_geneID))
## [1] 20187
range(kept$from_count)
## [1]  1 51
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 5
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 11
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

6.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 118                  52                  53                  25 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 843                  42                  69                  19 
##         RBH_MapMan4              reject 
##                  40                1144
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pavi'
  , plantNameOut = "wildcherry"
  , plantDirOut = file.path('..', 'reports', group, "wildcherry")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001614e2ac588>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-95] | |…… | 11% | |……. | 15% [unnamed-chunk-96] | |……… | 19% | |……….. | 22% [unnamed-chunk-97] | |…………. | 26% | |…………… | 30% [unnamed-chunk-98] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-99] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-100] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-101] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-102] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-103] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-104] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-105] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-106] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-107] | |…………………………………………..| 100%

cat(child_content)

7 Subsection: pavi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

7.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##    4367   45924   19708   38228   25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID  to_plant source 
##    <chr>       <chr>      <chr>      <chr>    <chr>  
##  1 AT1G12040   ath        FUN_000050 pavi     FastOMA
##  2 AT1G62440   ath        FUN_000050 pavi     FastOMA
##  3 AT2G43840   ath        FUN_040335 pavi     FastOMA
##  4 AT2G44050   ath        FUN_040336 pavi     FastOMA
##  5 AT1G04220   ath        FUN_000300 pavi     MCScanX
##  6 AT1G04230   ath        FUN_000316 pavi     MCScanX
##  7 ATMG01190   ath        FUN_026738 pavi     MCScanX
##  8 ATMG00910   ath        FUN_040205 pavi     MCScanX
##  9 AT4G39370   ath        FUN_020728 pavi     OrthoDB
## 10 AT3G06350   ath        FUN_020749 pavi     OrthoDB
## 11 AT4G24220   ath        FUN_029917 pavi     OrthoDB
## 12 AT4G24220   ath        FUN_029968 pavi     OrthoDB
## 13 AT1G01030   ath        FUN_025493 pavi     RBH    
## 14 AT1G01040   ath        FUN_011748 pavi     RBH    
## 15 ATMG01250   ath        FUN_040221 pavi     RBH    
## 16 ATMG01360   ath        FUN_026804 pavi     RBH    
## 17 AT2G22690   ath        FUN_021390 pavi     compara
## 18 AT4G37880   ath        FUN_021390 pavi     compara
## 19 AT4G39770   ath        FUN_021012 pavi     compara
## 20 AT4G21740   ath        FUN_032538 pavi     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4096225 218.8   11915492  636.4  14894365  795.5
## Vcells 62787153 479.1  185724412 1417.0 232155515 1771.3

7.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22167
length(unique(dt$to_geneID))
## [1] 21950
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##    4367   45924   19708   38228   25594
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

7.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

7.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

7.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

7.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 71643     2
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4707333 251.4   11915492  636.4  14894365  795.5
## Vcells 60032963 458.1  185729008 1417.0 232155515 1771.3

7.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

7.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22167
length(unique(df$to_geneID))
## [1] 21950
range(df$from_count)
## [1]   1 122
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 242
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 131
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          56863           8890           5892
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       28791     7489           4881
##   2        9611      946            572
##   3        8287      298            243
##   4        8423      131            163
##   5        1751       26             33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5359            1923            3416
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                1309                3416                2133                 891 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               19708                3052                5359                1307 
##         RBH_MapMan4              reject 
##                1923               32547
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

7.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34836           1933           2329
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       11001      732           1533
##   2        6509      770            385
##   3        7254      276            215
##   4        8321      129            163
##   5        1751       26             33
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19897
length(unique(kept$to_geneID))
## [1] 20960
range(kept$from_count)
## [1]  1 59
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 11
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

7.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                  47                 116                  79                  36 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 789                  84                 116                  61 
##         RBH_MapMan4              reject 
##                  36                1385
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'parm'
  , plantNameOut = "apricot"
  , plantDirOut = file.path('..', 'reports', group, "apricot")
  , flag = 3
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001618d84bd68>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-123] | |…… | 11% | |……. | 15% [unnamed-chunk-124] | |……… | 19% | |……….. | 22% [unnamed-chunk-125] | |…………. | 26% | |…………… | 30% [unnamed-chunk-126] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-127] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-128] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-129] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-130] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-131] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-132] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-133] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-134] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-135] | |…………………………………………..| 100%

cat(child_content)

8 Subsection: parm

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

8.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   43038   18615   52084   25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_plant to_geneID       to_plant source 
##    <chr>       <chr>      <chr>           <chr>    <chr>  
##  1 AT1G12040   ath        PruarM.1G000500 parm     FastOMA
##  2 AT1G62440   ath        PruarM.1G000500 parm     FastOMA
##  3 AT2G47410   ath        PruarM.8G368500 parm     FastOMA
##  4 AT4G02060   ath        PruarM.8G368700 parm     FastOMA
##  5 AT1G04400   ath        PruarM.1G051900 parm     MCScanX
##  6 AT1G04410   ath        PruarM.1G052500 parm     MCScanX
##  7 AT5G64410   ath        PruarM.8G146300 parm     MCScanX
##  8 AT5G64410   ath        PruarM.8G146200 parm     MCScanX
##  9 AT5G10270   ath        PruarM.1G279700 parm     OrthoDB
## 10 AT5G64960   ath        PruarM.1G279700 parm     OrthoDB
## 11 AT2G15790   ath        PruarM.8G195500 parm     OrthoDB
## 12 AT4G34660   ath        PruarM.8G163400 parm     OrthoDB
## 13 AT1G01010   ath        PruarM.2G368400 parm     RBH    
## 14 AT1G01030   ath        PruarM.5G193300 parm     RBH    
## 15 ATMG01360   ath        PruarM.4G189900 parm     RBH    
## 16 ATMG01360   ath        PruarM.4G190100 parm     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4146652 221.5   11915492  636.4  14894365  795.5
## Vcells 64254829 490.3  185767249 1417.3 232155515 1771.3

8.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22351
length(unique(dt$to_geneID))
## [1] 22551
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   43038   18615   52084   25259
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

8.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

8.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

8.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

8.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "athName"         "athSynonims"     "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 84042
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4048404 216.3   11915492  636.4  14894365  795.5
## Vcells 47943365 365.8  148613800 1133.9 232155515 1771.3

8.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

8.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22351
length(unique(df$to_geneID))
## [1] 22551
range(df$from_count)
## [1]   1 267
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 344
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 392
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          67784          10212           6046
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       41417     8961           4950
##   2        9532      896            600
##   3        8599      227            306
##   4        8236      128            190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##           18336            2176            2959
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                2959                2357                 842               18615 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                3716               18336                1371                2176 
##              reject 
##               33670
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

8.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          46348           1673           2351
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       23953      606           1463
##   2        6492      726            426
##   3        7667      213            272
##   4        8236      128            190
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19826
length(unique(kept$to_geneID))
## [1] 20698
range(kept$from_count)
## [1]   1 261
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 78
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 275
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

8.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                 125                 110                  23                 768 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                 123                 139                  56                  34 
##              reject 
##                1497
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcox'
  , plantNameOut = "pear"
  , plantDirOut = file.path('..', 'reports', group, "pear")
  , flag = 4
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000161d4175ca8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-151] | |…… | 11% | |……. | 15% [unnamed-chunk-152] | |……… | 19% | |……….. | 22% [unnamed-chunk-153] | |…………. | 26% | |…………… | 30% [unnamed-chunk-154] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-155] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-156] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-157] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-158] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-159] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-160] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-161] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-162] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-163] | |…………………………………………..| 100%

cat(child_content)

9 Subsection: pcox

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

9.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##   75969   33983   36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID                     to_plant source 
##    <chr>       <chr>      <chr>                         <chr>    <chr>  
##  1 AT1G20520   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  2 AT1G76210   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  3 AT5G53090   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  4 AT5G53100   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  5 AT1G03900   ath        Pyrco.da.v2a1.chr10A.089110   pcox     MCScanX
##  6 AT1G03920   ath        Pyrco.da.v2a1.chr10A.089090   pcox     MCScanX
##  7 AT5G56870   ath        Pyrco.da.v2a1.chr9A.225970    pcox     MCScanX
##  8 AT5G57035   ath        Pyrco.da.v2a1.chr9A.225390    pcox     MCScanX
##  9 AT1G01030   ath        Pyrco.da.v2a1.chr14A.371380   pcox     RBH    
## 10 AT1G01030   ath        Pyrco.da.v2a1.chr1A.345960    pcox     RBH    
## 11 ATMG01250   ath        Pyrco.da.v2a1.chr5A.045340    pcox     RBH    
## 12 ATMG01250   ath        Pyrco.da.v2a1.snap.153710     pcox     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3761949 201.0   11915492  636.4  14894365  795.5
## Vcells 52119692 397.7  148652151 1134.2 232155515 1771.3

9.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 21603
length(unique(dt$to_geneID))
## [1] 30838
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   75969   33983   36090
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

9.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

9.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

9.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

9.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 95885
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3685817 196.9   11915492 636.4  14894365  795.5
## Vcells 41892282 319.7  118921721 907.4 232155515 1771.3

9.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

9.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 21603
length(unique(df$to_geneID))
## [1] 30838
range(df$from_count)
## [1]   1 136
range(df$to_count)
## [1]   1 116
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 261
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 287
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          76754          10595           8536
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       44198     9410           7873
##   2       17208      975            468
##   3       15348      210            195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##            4486            8656
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##            8656            5612           33983            4486           43148
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

9.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          46863           3471           2403
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       16410     2355           1804
##   2       15105      906            404
##   3       15348      210            195
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19972
length(unique(kept$to_geneID))
## [1] 30046
range(kept$from_count)
## [1]  1 44
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 12
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 37
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

9.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             332             175            1459             134            1204
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcer'
  , plantNameOut = "cherryplum"
  , plantDirOut = file.path('..', 'reports', group, "cherryplum")
  , flag = 4
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000016218a7ba58>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-179] | |…… | 11% | |……. | 15% [unnamed-chunk-180] | |……… | 19% | |……….. | 22% [unnamed-chunk-181] | |…………. | 26% | |…………… | 30% [unnamed-chunk-182] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-183] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-184] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-185] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-186] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-187] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-188] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-189] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-190] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-191] | |…………………………………………..| 100%

cat(child_content)

10 Subsection: pcer

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

10.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##  162100   63358   80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G32760   ath        Pcer_000001-RA pcer     FastOMA
##  2 AT1G07920   ath        Pcer_000002-RA pcer     FastOMA
##  3 AT1G16650   ath        Pcer_097557-RA pcer     FastOMA
##  4 AT1G53530   ath        Pcer_097558-RA pcer     FastOMA
##  5 AT1G04220   ath        Pcer_000137-RA pcer     MCScanX
##  6 AT1G04230   ath        Pcer_000150-RA pcer     MCScanX
##  7 ATMG00080   ath        Pcer_091446-RA pcer     MCScanX
##  8 ATMG00513   ath        Pcer_091469-RA pcer     MCScanX
##  9 AT1G01030   ath        Pcer_027461-RA pcer     RBH    
## 10 AT1G01030   ath        Pcer_038773-RA pcer     RBH    
## 11 ATMG01330   ath        Pcer_091451-RA pcer     RBH    
## 12 ATMG01360   ath        Pcer_096779-RA pcer     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3510935 187.6   11915492 636.4  14894365  795.5
## Vcells 44978395 343.2  118921721 907.4 232155515 1771.3

10.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22197
length(unique(dt$to_geneID))
## [1] 71437
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##  162100   63358   80487
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

10.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

10.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

10.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

10.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 201293      3
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  4201617 224.4   11915492 636.4  14894365  795.5
## Vcells 57476412 438.6  118921721 907.4 232155515 1771.3

10.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

10.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22197
length(unique(df$to_geneID))
## [1] 71437
range(df$from_count)
## [1]   1 270
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 890
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 449
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##         166873          19225          15198
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       97070    16707          13585
##   2       39984     1995           1240
##   3       29819      523            373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##           12103           25962
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##           25962           18052           63358           12103           81821
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

10.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##         108487           4446           6542
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       42973     2064           5067
##   2       35695     1859           1102
##   3       29819      523            373
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20941
length(unique(kept$to_geneID))
## [1] 69011
range(kept$from_count)
## [1]   1 258
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 282
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 78
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

10.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             956             583            2541             351            3036
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'psib'
  , plantNameOut = "siberianapricot"
  , plantDirOut = file.path('..', 'reports', group, "siberianapricot")
  , flag = 4
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000161cd798320>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-207] | |…… | 11% | |……. | 15% [unnamed-chunk-208] | |……… | 19% | |……….. | 22% [unnamed-chunk-209] | |…………. | 26% | |…………… | 30% [unnamed-chunk-210] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-211] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-212] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-213] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-214] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-215] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-216] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-217] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-218] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-219] | |…………………………………………..| 100%

cat(child_content)

11 Subsection: psib

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

11.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##   40732   16398   25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID         to_plant source 
##    <chr>       <chr>      <chr>             <chr>    <chr>  
##  1 AT1G12040   ath        PaF106G0100000005 psib     FastOMA
##  2 AT1G62440   ath        PaF106G0100000005 psib     FastOMA
##  3 AT3G07140   ath        PaF106G0800032954 psib     FastOMA
##  4 AT3G07140   ath        PaF106G0800032956 psib     FastOMA
##  5 AT1G04230   ath        PaF106G0100000358 psib     MCScanX
##  6 AT1G04240   ath        PaF106G0100000361 psib     MCScanX
##  7 ATCG00340   ath        PaF106G0700028636 psib     MCScanX
##  8 ATCG00350   ath        PaF106G0700028635 psib     MCScanX
##  9 AT1G01030   ath        PaF106G0500020091 psib     RBH    
## 10 AT1G01040   ath        PaF106G0200009357 psib     RBH    
## 11 ATMG01190   ath        PaF106G0600023358 psib     RBH    
## 12 ATMG01250   ath        PaF106G0700028671 psib     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3867738 206.6    9532394 509.1  14894365  795.5
## Vcells 62767524 478.9  118926637 907.4 232155515 1771.3

11.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 21427
length(unique(dt$to_geneID))
## [1] 19982
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   40732   16398   25288
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

11.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

11.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

11.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

11.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 53638
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3482212 186.0    9532394 509.1  14894365  795.5
## Vcells 35820380 273.3  118926637 907.4 232155515 1771.3

11.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

11.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 21427
length(unique(df$to_geneID))
## [1] 19982
range(df$from_count)
## [1]  1 58
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 132
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 105
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          44419           5632           3587
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       25281     4776           3224
##   2       11051      631            252
##   3        8087      225            111
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##            4572            4786
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##            4786            5843           16398            4572           22039
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

11.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          28419           1706           1474
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       10809      900           1152
##   2        9523      581            211
##   3        8087      225            111
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 18899
length(unique(kept$to_geneID))
## [1] 19230
range(kept$from_count)
## [1]  1 46
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 7
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 16
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

11.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             200             179             674             146             789
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Step 1: params_list
# params_list <- list(
# ...
# )
# 
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
#   plantName1: NULL
#   plantName2: NULL
#   plantName3: NULL
#   plantName4: NULL
#   plantDirIn: NULL
#   plantNameOut: NULL
#   plantDirOut: NULL
#   pattern_in: NULL
#   pattern_out: NULL
#   compara_pattern_in1: NULL
#   compara_pattern_in2: NULL
#   plaza_pattern_in1: NULL
#   plaza_pattern_in2: NULL
#   ref_genome: NULL
#   mercator: NULL
#   mercatorPatternIn1: NULL
#   mercatorPatternOut1: NULL
#   mercatorPatternIn2: NULL
#   mercatorPatternOut2: NULL
# ---
# 
# 
# access params in the script like:
# params$plantName1
# params$plantDirOut
# 
# Step 3: Call render() like
# rmarkdown::render(
#   input = "09_fruitTrees-child.Rmd",
#   params = params_list,
#   envir = new.env(parent = globalenv()),  # optional: isolate execution
#   output_format = "html_document",
#   quiet = FALSE
# )
# 
# 
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory

12 SessionInfo

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8  LC_CTYPE=Slovenian_Slovenia.utf8   
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=Slovenian_Slovenia.utf8    
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0      knitr_1.50         data.table_1.17.8 
## [5] magrittr_2.0.4    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.5.1     zip_2.3.3          Rcpp_1.1.0         tidyselect_1.2.1  
##  [9] stringr_1.5.2      jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10       
## [13] fastmap_1.2.0      R6_2.6.1           labeling_0.4.3     generics_0.1.4    
## [17] patchwork_1.3.2    igraph_2.1.4       openxlsx_4.2.8     tibble_3.3.0      
## [21] bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3 rlang_1.1.6       
## [25] utf8_1.2.6         cachem_1.1.0       stringi_1.8.7      xfun_0.53         
## [29] sass_0.4.10        S7_0.2.0           cli_3.6.5          withr_3.0.2       
## [33] digest_0.6.37      grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4   
## [37] vctrs_0.6.5        evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [41] colorspace_2.1-1   rmarkdown_2.29     tools_4.5.1        pkgconfig_2.0.3   
## [45] htmltools_0.5.8.1